all_meta <- read_csv(file = here::here('data/drawings/stringent_cleaned_dataset_meta/all_object_metadata_cleaned.csv')) %>%
as_tibble() %>%
# filter(age_numeric>2) %>%
mutate(category = str_split_fixed(category,' ',2)[,2]) %>%
mutate(category = str_replace(category,' ','.')) # ice cream
## Parsed with column specification:
## cols(
## session_id = col_character(),
## category = col_character(),
## age = col_character(),
## num_strokes = col_double(),
## draw_duration_old = col_double(),
## draw_duration_new = col_double(),
## mean_intensity = col_double(),
## age_numeric = col_double(),
## filename = col_character()
## )
num_subs = length(unique(all_meta$session_id))
The final, filtered dataset of N=37770 drawings from 48 categories from 8084 children who were on average 5.3306328years of age (range 3-10 years).
frequency = read_csv(file = here::here('data/surveys/drawing_experience/preprocessed/Category_frequency_survey.csv'))
## Parsed with column specification:
## cols(
## childs_age = col_double(),
## subject_id = col_character(),
## category = col_character(),
## often_drawn_rating = col_double()
## )
To assess this, 51 parents of children aged 3-10 years filled out a survey asking about the frequency with with their children drew the categories in the dataset.
count_by_age <- frequency %>%
group_by(childs_age) %>%
dplyr::summarize(num_surveys = length(unique(subject_id)))
## `summarise()` ungrouping output (override with `.groups` argument)
freq_by_category <- frequency %>%
# filter(childs_age > 2) %>%
mutate(category = str_split_fixed(category,' ',2)[,2]) %>%
mutate(category = str_replace(category,' ','.')) %>%
filter(category %in% all_meta$category) %>%
group_by(category) %>%
summarize(drawing_frequency = mean(often_drawn_rating)) %>%
mutate(above_median_freq = drawing_frequency > median(drawing_frequency))
## `summarise()` ungrouping output (override with `.groups` argument)
# write_csv(freq_by_category, here::here('data/surveys/drawing_experience/preprocessed/freq_by_category.csv'))
all_tracings <- read_csv(here('data/tracing/rated_all_museumstation_filtered.csv'))%>%
select(-X1, -X)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## X = col_double(),
## session_id = col_character(),
## age = col_double(),
## category = col_character(),
## pre_tran = col_double(),
## post_tran = col_double(),
## rotate = col_double(),
## translate = col_double(),
## scale = col_double(),
## has_ref = col_logical(),
## rating = col_double()
## )
## Make averages for joining
by_subject_tracing_avg <- all_tracings %>%
distinct(session_id, category, age, rating) %>%
group_by(session_id) %>%
summarize(avg_tracing_rating = mean(rating))
## `summarise()` ungrouping output (override with `.groups` argument)
animacy_csv <- read_csv(here::here('data/drawings/category_metadata/animacy.csv')) %>%
as_tibble() %>%
mutate(animacy_size = case_when(animacy == '0' & size=='0' ~ 2,
animacy == '0' & size=='1' ~ 1,
animacy == '1' & size=='0' ~ 3,
animacy == '1' & size=='1' ~ 4))
## Parsed with column specification:
## cols(
## animacy = col_double(),
## size = col_double(),
## category = col_character(),
## vehicle = col_double()
## )
num_batches=232
reg_string = 'C_0.1_T_0.1'
classification_data <- read.csv(here::here('data','compiled_classifications/',paste0(reg_string, 'batchtotal_',as.character(num_batches),'.csv'))) %>%
mutate(session_id = paste('cdm_',session_id,sep="")) %>%
mutate(age_numeric = age) %>%
mutate(age = paste('age',age,sep="")) %>%
mutate(age = as.factor(age)) %>%
mutate(category = target_label) %>%
mutate(image_name = paste(target_label,'_sketch_', age,'_', session_id,'.png',sep="")) %>%
select(-X) %>%
mutate(category = str_replace(category,' ','.')) # ice cream = ice.cream
d <- classification_data %>%
mutate(correct_or_not = as.logical(correct_or_not)) %>%
gather(key = 'class', value = 'prob', contains('prob')) %>%
mutate(class = str_split_fixed(class, '_prob',2)[,1]) %>%
group_by(image_name, age, category, correct_or_not, session_id, age_numeric) %>%
summarize(denom = sum(prob), target_label_prob = prob[class==category], log_odds = log(target_label_prob / (denom - target_label_prob))) %>%
rename(filename = image_name) %>%
left_join(all_meta, by=c("filename", "category", "age_numeric","session_id")) %>%
mutate(draw_duration = draw_duration_old) %>%
mutate(run = substr(session_id,0,10)) %>%
left_join(freq_by_category)
## `summarise()` regrouping output by 'image_name', 'age', 'category', 'correct_or_not', 'session_id' (override with `.groups` argument)
## Joining, by = "category"
d <- d %>%
left_join(by_subject_tracing_avg)
## Joining, by = "session_id"
# weird things were happening with category matching, check
assert_that(length(unique(d$filename)) == length(unique(classification_data$image_name)))
## [1] TRUE
# every drawing should have all of these, regardless
assert_that(sum(is.na(d$age_numeric))==0)
## [1] TRUE
assert_that(sum(is.na(d$category))==0)
## [1] TRUE
assert_that(sum(is.na(d$correct_or_not))==0)
## [1] TRUE
missing_meta <- d %>%
filter(is.na(num_strokes))
assert_that(length(missing_meta$filename)==0)
## [1] TRUE
### How do our covariates change with age? Compute means and CIs; Group by age/category
## first summarize data
cor_by_age <- d %>%
group_by(age_numeric,category) %>%
summarize(avg_cor = mean(correct_or_not)) %>%
group_by(age_numeric) %>%
multi_boot_standard(col = "avg_cor")
## `summarise()` regrouping output by 'age_numeric' (override with `.groups` argument)
# cor_by_age_by_session <- d %>%
# group_by(session_id, age_numeric) %>%
# summarize(avg_cor = mean(correct_or_not)) %>%
# group_by(age_numeric) %>%
# multi_boot_standard(col = "avg_cor")
draw_duration <- d %>%
group_by(age_numeric,category) %>%
summarize(avg_draw_duration = mean(draw_duration)) %>%
group_by(age_numeric) %>%
multi_boot_standard(col = "avg_draw_duration")
## `summarise()` regrouping output by 'age_numeric' (override with `.groups` argument)
num_strokes <- d %>%
group_by(age_numeric,category) %>%
summarize(avg_num_strokes = mean(num_strokes)) %>%
group_by(age_numeric) %>%
multi_boot_standard(col = "avg_num_strokes")
## `summarise()` regrouping output by 'age_numeric' (override with `.groups` argument)
avg_intensity <- d %>%
group_by(age_numeric,category) %>%
summarize(avg_intensity = mean(mean_intensity)) %>%
group_by(age_numeric) %>%
multi_boot_standard(col = "avg_intensity")
## `summarise()` regrouping output by 'age_numeric' (override with `.groups` argument)
tracing_scores <- d %>%
distinct(session_id,age_numeric,avg_tracing_rating) %>%
filter(!is.na(avg_tracing_rating)) %>%
group_by(age_numeric) %>%
multi_boot_standard(col = "avg_tracing_rating")
## Make compiled plot of descriptives
base_size_chosen=12 # size of text in plots
smooth_alpha=.2
cor_by_age_plot_A = ggplot(cor_by_age, aes(age_numeric,mean, color=age_numeric)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
theme_few(base_size = base_size_chosen) +
labs(x='Age', y='Classification accuracy') +
scale_color_viridis(option="D") +
theme(legend.position = "none") +
geom_smooth(col='grey', alpha=smooth_alpha) +
ylim(0,.75) +
geom_hline(yintercept = 1/48, linetype="dashed", color="grey") +
ggtitle('A')
p1=ggplot(draw_duration, aes(age_numeric,mean, color=age_numeric)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
theme_few(base_size = base_size_chosen) +
labs(x='Age', y='Drawing duration (s)') +
scale_color_viridis(option="D") +
theme(legend.position = "none") +
ylim(0,15) +
geom_smooth(col='grey', span = 10) +
ggtitle('B')
p2=ggplot(avg_intensity, aes(age_numeric,mean, color=age_numeric)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
theme_few(base_size = base_size_chosen) +
labs(x='Age', y='Ink used (mean intensity)') +
scale_color_viridis(option="D") +
theme(legend.position = "none") +
ylim(.02,.06) +
geom_smooth(col='grey', span = 10,alpha=smooth_alpha) +
ggtitle('C')
p3=ggplot(num_strokes, aes(age_numeric,mean, color=age_numeric)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
theme_few(base_size = base_size_chosen) +
labs(x='Age', y='Number of strokes') +
scale_color_viridis(option="D") +
theme(legend.position = "none") +
ylim(0,20) +
geom_smooth(col='grey', span = 10,alpha=smooth_alpha) +
ggtitle('D')
p4=ggplot(tracing_scores, aes(age_numeric,mean, color=age_numeric)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
theme_few(base_size = base_size_chosen) +
labs(x='Age', y='Estimated tracing score') +
scale_color_viridis(option="D") +
theme(legend.position = "none") +
ylim(0,4) +
geom_smooth(col='grey', span = 10,alpha=smooth_alpha) +
ggtitle('E')
ggarrange(cor_by_age_plot_A,p1,p2,p3,p4, nrow=1)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggsave('figures/mainResults.pdf',width=7.5, height=3, units='in')
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
num_bins = 3
cor_by_age_by_strokes <- d %>%
ungroup() %>%
mutate(bin = ntile(num_strokes, num_bins)) %>%
group_by(bin, age_numeric,category) %>%
summarize(avg_cor = mean(correct_or_not)) %>%
group_by(age_numeric,bin) %>%
multi_boot_standard(col = "avg_cor") %>%
mutate(covariate = 'by strokes drawn')
## `summarise()` regrouping output by 'bin', 'age_numeric' (override with `.groups` argument)
cor_by_age_by_time <- d %>%
ungroup() %>%
mutate(bin = ntile(draw_duration, num_bins)) %>%
group_by(bin, age_numeric,category) %>%
summarize(avg_cor = mean(correct_or_not)) %>%
group_by(age_numeric,bin) %>%
multi_boot_standard(col = "avg_cor") %>%
mutate(covariate = 'by time spent')
## `summarise()` regrouping output by 'bin', 'age_numeric' (override with `.groups` argument)
cor_by_age_by_intensity <- d %>%
ungroup() %>%
mutate(bin = ntile(mean_intensity, num_bins)) %>%
group_by(bin, age_numeric,category) %>%
summarize(avg_cor = mean(correct_or_not)) %>%
group_by(age_numeric,bin) %>%
multi_boot_standard(col = "avg_cor") %>%
mutate(covariate = 'by ink used')
## `summarise()` regrouping output by 'bin', 'age_numeric' (override with `.groups` argument)
cor_by_age_by_tracing <- d %>%
ungroup() %>%
filter(!is.na(avg_tracing_rating)) %>%
mutate(bin = ntile(avg_tracing_rating, num_bins)) %>%
group_by(bin, age_numeric,category) %>%
summarize(avg_cor = mean(correct_or_not)) %>%
group_by(age_numeric,bin) %>%
multi_boot_standard(col = "avg_cor") %>%
mutate(covariate = 'by tracing ability')
## `summarise()` regrouping output by 'bin', 'age_numeric' (override with `.groups` argument)
merged <- cor_by_age_by_intensity %>%
full_join(cor_by_age_by_strokes) %>%
full_join(cor_by_age_by_time) %>%
full_join(cor_by_age_by_tracing) %>%
# filter(age_numeric > 2) %>%
mutate(bin_name = as.numeric(bin))
## Joining, by = c("age_numeric", "bin", "ci_lower", "ci_upper", "mean", "covariate")
## Joining, by = c("age_numeric", "bin", "ci_lower", "ci_upper", "mean", "covariate")
## Joining, by = c("age_numeric", "bin", "ci_lower", "ci_upper", "mean", "covariate")
# mutate(bin_name = case_when(bin == 1 ~ "Low",
# bin == 2 ~ "Medium",
# bin == 3 ~ "High"))
ggplot(merged, aes(age_numeric,mean*100, color=bin_name, group=bin_name, col=bin_name)) +
geom_pointrange(aes(ymin = ci_lower*100, ymax = ci_upper*100), alpha=.6, size=.25) +
theme_few(base_size = 10) +
labs(x='Age of child drawing (yrs)', y='Category classification \n accuracy') +
# scale_color_viridis(option="C", begin=.2, end=.8, discrete=TRUE, name = 'Effort') +
scale_x_continuous(breaks=seq(2,10,1)) +
geom_smooth(span=10, alpha=smooth_alpha, size=.4) +
ylim(0,100) +
geom_hline(yintercept = 1/48, linetype="dashed", color="grey") +
theme(legend.position ='none',aspect.ratio = 1) +
scale_fill_discrete(labels=c('Low','Medium','High')) +
# theme(aspect.ratio = 1, legend.position = c(.08, .75),legend.text = element_text(size=6),legend.title = element_text(size=8),legend.background = element_rect(fill=alpha('white', 0))) +
facet_grid(~covariate)
## Warning: Duplicated aesthetics after name standardisation: colour
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggsave('figures/visuomotor_control_wide.pdf', units='in')
## Saving 7 x 5 in image
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
cor_by_age_by_tracing_only <- d %>%
ungroup() %>%
filter(!is.na(avg_tracing_rating)) %>%
mutate(bin = ntile(avg_tracing_rating, 3)) %>%
group_by(bin, age_numeric,category) %>%
summarize(avg_cor = mean(correct_or_not)) %>%
group_by(age_numeric,bin) %>%
multi_boot_standard(col = "avg_cor")
## `summarise()` regrouping output by 'bin', 'age_numeric' (override with `.groups` argument)
ggplot(cor_by_age_by_tracing_only, aes(age_numeric,mean*100, color=bin, group=bin, col=bin)) +
geom_pointrange(aes(ymin = ci_lower*100, ymax = ci_upper*100), alpha=.6, size=.25) +
theme_few(base_size = 14) +
labs(x='Age of child drawing (yrs)', y='Category classification \n accuracy') +
scale_x_continuous(breaks=seq(2,10,1)) +
geom_smooth(span=10, alpha=smooth_alpha, size=.4) +
ylim(0,80) +
geom_hline(yintercept = 1/48, linetype="dashed", color="grey") +
theme(legend.position ='none',aspect.ratio = 1)
## Warning: Duplicated aesthetics after name standardisation: colour
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# scale_fill_discrete(labels=c('Low','Medium','High')) +
# theme(aspect.ratio = 1, legend.position = c(.08, .75),legend.text = element_text(size=6),legend.title = element_text(size=8),legend.background = element_rect(fill=alpha('white', 0)))
tracing_scores_raw <- d %>%
distinct(session_id,age_numeric,avg_tracing_rating) %>%
filter(!is.na(avg_tracing_rating))
tracing_scores <- d %>%
distinct(session_id,age_numeric,avg_tracing_rating) %>%
filter(!is.na(avg_tracing_rating)) %>%
group_by(age_numeric) %>%
multi_boot_standard(col = "avg_tracing_rating")
ggplot(tracing_scores, aes(age_numeric,mean, color=age_numeric)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
geom_jitter(height=.1, width=.4, data=tracing_scores_raw, aes(y=avg_tracing_rating, x=age_numeric), alpha=.02) +
theme_few(base_size = 14) +
labs(x='Age', y='Estimated tracing score') +
scale_color_viridis(option="D") +
scale_x_continuous(breaks=seq(2,10,1)) +
theme(legend.position = "none") +
ylim(0,4) +
geom_smooth(col='grey', span = 10,alpha=smooth_alpha)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1034 rows containing missing values (geom_point).
cor_by_session <- d %>%
group_by(age_numeric,session_id) %>%
# filter(age_numeric >2) %>%
dplyr::summarize(mean = mean(correct_or_not), num_drawings = n()) %>%
group_by(age_numeric)
## `summarise()` regrouping output by 'age_numeric' (override with `.groups` argument)
base_size_chosen=10
cor_by_category <- d %>%
group_by(age_numeric,category) %>%
# filter(age_numeric >2) %>%
dplyr::summarize(mean = mean(correct_or_not), num_drawings = n()) %>%
group_by(age_numeric)
## `summarise()` regrouping output by 'age_numeric' (override with `.groups` argument)
base_size_chosen=10
base_size_chosen=10
smooth_alpha=.01
ggplot(cor_by_age, aes(age_numeric,mean*100, col=age_numeric)) +
geom_jitter(data=cor_by_category, width=.1, height=0, alpha=.2) +
geom_pointrange(aes(y=mean*100, ymin = ci_lower*100, ymax = ci_upper*100)) +
geom_smooth(alpha=smooth_alpha, color='grey', span=10) +
theme_few(base_size = base_size_chosen) +
labs(x='Age of child drawing (yrs)', y='Drawing classification accuracy') +
scale_x_continuous(breaks = seq(2,10,1)) +
theme(legend.position = "none", aspect.ratio = 1) +
scale_color_viridis(option="D", breaks=seq(2,10,1)) +
geom_hline(yintercept = 1/48, linetype="dashed", color="grey")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggsave('figures/cor_by_category.pdf', width=3, height=3, units='in')
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(cor_by_age, aes(age_numeric,mean, col=age_numeric)) +
geom_jitter(data=cor_by_session, width=.1, height=.05, alpha=.01) +
geom_pointrange(aes(y=mean, ymin = ci_lower, ymax = ci_upper)) +
geom_smooth(alpha=smooth_alpha, color='grey', span=10) +
theme_few(base_size = base_size_chosen) +
labs(x='Age', y='Classification accuracy') +
scale_x_continuous(breaks = seq(3,10,1)) +
theme(legend.position = "none", aspect.ratio = 1) +
scale_color_viridis(option="D", breaks=seq(3,10,1)) +
geom_hline(yintercept = 1/48, linetype="dashed", color="grey")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggsave('figures/cor_by_session.pdf', width=3, height=3, units='in')
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# ggplot(cor_by_session %>% filter(age_numeric > 2), aes(x=as.factor(age_numeric),y=mean, col=age_numeric, size=num_drawings)) +
# geom_flat_violin() +
# geom_point(position = position_jitter(width=.15, height = .01), size = .25)+
# theme_cowplot()+
# guides(fill = FALSE, colour = FALSE)
#
#
# # geom_jitter(data=cor_by_session, width=.1, height=.1, alpha=.01, aes(size=num_drawings)) +
# # geom_pointrange(aes(y=mean, ymin = ci_lower, ymax = ci_upper)) +
# geom_smooth(alpha=smooth_alpha, color='grey') +
# theme_few(base_size = base_size_chosen) +
# labs(x='Age', y='Classification accuracy') +
# scale_color_viridis(option="D") +
# theme(legend.position = "none", aspect.ratio = 1) +
# ylim(0,.75) +
# geom_hline(yintercept = 1/48, linetype="dashed", color="grey")
cor_by_category_by_age <- d %>%
group_by(age_numeric,category) %>%
summarize(avg_log_odds = mean(log_odds), num_drawings = n()) %>%
left_join(freq_by_category) %>%
mutate(category = fct_reorder(category, drawing_frequency, .desc=TRUE))
## `summarise()` regrouping output by 'age_numeric' (override with `.groups` argument)
## Joining, by = "category"
ggplot(cor_by_category_by_age, aes(age_numeric,avg_log_odds, color=drawing_frequency, size=num_drawings)) +
geom_point(alpha=.5) +
theme_few(base_size = base_size_chosen) +
labs(x='Age', y='Log odds') +
scale_color_viridis(option="A") +
theme(legend.position = "none") +
geom_smooth(span=10, alpha=smooth_alpha) +
# ylim(0,1) +
geom_hline(yintercept = 1/48, linetype="dashed", color="grey") +
facet_wrap(~category, nrow=6)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
cor_by_category_by_age <- d %>%
filter(age_numeric > 2) %>%
group_by(session_id, age_numeric,category) %>%
dplyr::summarize(avg_cor = mean(correct_or_not)) %>%
group_by(age_numeric, category) %>%
multi_boot_standard(col = 'avg_cor') %>%
ungroup () %>%
left_join(freq_by_category) %>%
mutate(category = fct_reorder(category, drawing_frequency))
## `summarise()` regrouping output by 'session_id', 'age_numeric' (override with `.groups` argument)
## Joining, by = "category"
ggplot(cor_by_category_by_age, aes(age_numeric,mean, color=drawing_frequency)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), position=position_dodge(width=.2), alpha=.8) +
theme_few(base_size = base_size_chosen) +
labs(x='Age', y='Proportion correct') +
scale_color_viridis(option="A", begin=.4, end=.8) +
theme(legend.position = "none") +
geom_smooth(span=10, alpha=smooth_alpha) +
# ylim(0,1) +
geom_hline(yintercept = 1/48, linetype="dashed", color="grey") +
facet_wrap(~category, nrow=6)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# first summarize data
cor_by_age_low_freq <- d %>%
group_by(above_median_freq, age_numeric,category) %>%
summarize(avg_cor = mean(correct_or_not)) %>%
group_by(age_numeric, above_median_freq) %>%
multi_boot_standard(col = "avg_cor")
## `summarise()` regrouping output by 'above_median_freq', 'age_numeric' (override with `.groups` argument)
ggplot(cor_by_age_low_freq, aes(age_numeric,mean, color=above_median_freq)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
theme_few(base_size = base_size_chosen) +
labs(x='Age', y='Classification accuracy') +
# scale_color_viridis(option="D") +
theme(legend.position = "none") +
geom_smooth(col='grey',span=10, alpha=smooth_alpha) +
ylim(0,.75) +
geom_hline(yintercept = 1/48, linetype="dashed", color="grey")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
lo_correct_category_by_age <- d %>%
# filter(age_numeric > 2) %>%
filter(correct_or_not==1) %>%
mutate(category = str_replace(category,'ice.cream','ice cream')) %>% # ice.cream -> ice_cream
mutate(age = cut(age_numeric, c(1.9, 5, 8,10.1), labels = c("2-4","5-7","8-10"))) %>%
group_by(session_id, age,category) %>%
summarize(avg_cor = mean(log_odds), num_drawings = n()) %>%
group_by(age, category) %>%
multi_boot_standard(col = 'avg_cor') %>%
ungroup () %>%
mutate(category = fct_reorder(category, mean))
## `summarise()` regrouping output by 'session_id', 'age' (override with `.groups` argument)
num_classes=48
chance = log(1/num_classes) - log ((num_classes - 1)/num_classes)
base_size_chosen=10
ggplot(lo_correct_category_by_age, aes(x = category, y = mean, col = age)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), alpha=.8, size=.2) +
coord_flip() +
theme_few(base_size = base_size_chosen) +
labs(y = "Classifier evidence (log odds)", x = "") +
scale_color_viridis(discrete=TRUE, begin=0, end=.8, name = "Age group") +
theme(legend.position = c(.8,.25), axis.text.y = element_text(size=5), legend.text = element_text(size=8), legend.background = element_rect(fill=alpha('white', 0)), aspect.ratio=1)
## Warning: Removed 1 rows containing missing values (geom_segment).
ggsave("figures/log_odds_by_category.pdf", units = 'in', height= 3)
## Saving 7 x 3 in image
## Warning: Removed 1 rows containing missing values (geom_segment).
lo_correct_category_by_age_small <- lo_correct_category_by_age %>%
group_by(category) %>%
mutate(category_avg = mean(mean)) %>%
filter(category_avg > -1)
ggplot(lo_correct_category_by_age_small, aes(x = category, y = mean, col = age)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), alpha=.8, size=.2) +
coord_flip() +
theme_few(base_size = 12) +
labs(y = "Classifier evidence (log odds)", x = "") +
scale_color_viridis(discrete=TRUE, begin=0, end=.8, name = "Age group") +
theme(legend.position = 'none')
# theme(legend.position = c(.8,.25),legend.text = element_text(size=8), legend.background = element_rect(fill=alpha('white', 0)))
ggsave("figures/log_odds_by_category_small.pdf", width = 3, height = 3, units = 'in')
log_odds_by_age_cor_only <- d %>%
filter(correct_or_not==1) %>%
group_by(age_numeric,category) %>%
summarize(avg_log_odds = mean(log_odds)) %>%
group_by(age_numeric) %>%
multi_boot_standard(col = "avg_log_odds")
## `summarise()` regrouping output by 'age_numeric' (override with `.groups` argument)
log_odds_by_age_cor_only_by_category <- d %>%
filter(correct_or_not==1) %>%
group_by(age_numeric,category) %>%
summarize(mean = mean(log_odds))
## `summarise()` regrouping output by 'age_numeric' (override with `.groups` argument)
base_size_chosen=8
ggplot(log_odds_by_age_cor_only, aes(age_numeric,mean, col=age_numeric)) +
# geom_jitter(data=log_odds_by_age_cor_only_by_category, width=.1, height=0, alpha=.3) +
geom_pointrange(aes(y=mean, ymin = ci_lower, ymax = ci_upper)) +
geom_smooth(alpha=smooth_alpha, color='grey', span=10) +
theme_few(base_size = base_size_chosen) +
labs(x='Age', y='Classification evidence') +
scale_x_continuous(breaks = seq(2,10,1)) +
theme(legend.position = "none", aspect.ratio = 1) +
scale_color_viridis(option="D", breaks=seq(2,10,1))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# geom_hline(yintercept = 1/48, linetype="dashed", color="grey")
ggsave('figures/cor_only_log_odds_by_category.pdf', width=3, height=3, units='in')
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
dog_probs <- classification_data %>%
select(-image_name, -batch, -batch_str, -X.1, -index, -age, -target_label,-session_id, -correct_or_not, -age_numeric) %>%
filter(category == 'dog') %>%
gather(key = category, value = prob) %>%
group_by(category) %>%
mutate(mean_prob = mean(prob)) %>%
ungroup() %>%
mutate(category = fct_reorder(category, mean_prob, .desc=TRUE))
ggplot(dog_probs, aes(x=category, y=prob)) +
geom_boxplot(alpha=.2) +
theme_few() +
theme(axis.text.x = element_text(angle = 90))
classification_data_long <- classification_data %>%
mutate(correct_or_not = as.logical(correct_or_not)) %>%
gather(key = 'class', value = 'prob', contains('prob')) %>%
mutate(class = str_split_fixed(class, '_prob',2)[,1])
confusions_by_age <- classification_data_long %>%
mutate(drawn_category = category) %>%
left_join(animacy_csv) %>%
ungroup() %>%
filter(correct_or_not==0) %>%
filter(age_numeric > 2) %>%
mutate(drawn_category = fct_reorder(drawn_category, animacy_size)) %>%
mutate(class = factor(class, levels = levels(drawn_category))) %>%
mutate(age_group = cut(age_numeric, c(2.9, 5, 7,10.1), labels = c("3-4","5-6","7-10"))) %>%
group_by(age_group, age_numeric, drawn_category, class) %>%
dplyr::summarize(mean_prob = mean(prob))
## Joining, by = "category"
## `summarise()` regrouping output by 'age_group', 'age_numeric', 'drawn_category' (override with `.groups` argument)
confusions_by_class <- classification_data_long %>%
mutate(drawn_category = category) %>%
left_join(animacy_csv) %>%
ungroup() %>%
filter(correct_or_not==0) %>%
# filter(age_numeric > 2) %>%
mutate(drawn_category = fct_reorder(drawn_category, animacy_size)) %>%
mutate(class = factor(class, levels = levels(drawn_category))) %>%
# mutate(age_group = cut(age_numeric, c(2.9, 5, 7,10.1), labels = c("3-4","5-6","7-10"))) %>%
group_by(drawn_category, class) %>%
dplyr::summarize(mean_prob = mean(prob))
## Joining, by = "category"
## `summarise()` regrouping output by 'drawn_category' (override with `.groups` argument)
levels(confusions_by_age$drawn_category)
## [1] "airplane" "bed" "bike" "boat" "cactus"
## [6] "car" "chair" "couch" "house" "piano"
## [11] "train" "tree" "TV" "apple" "book"
## [16] "bottle" "bowl" "clock" "cup" "hat"
## [21] "ice.cream" "key" "lamp" "mushroom" "phone"
## [26] "scissors" "watch" "bee" "bird" "cat"
## [31] "dog" "face" "fish" "frog" "hand"
## [36] "rabbit" "snail" "spider" "bear" "camel"
## [41] "cow" "elephant" "horse" "octopus" "person"
## [46] "sheep" "tiger" "whale"
sanity <- confusions_by_class %>%
group_by(drawn_category) %>%
summarise(sum_prob = sum(mean_prob))
## `summarise()` ungrouping output (override with `.groups` argument)
confusions_count_by_class <- classification_data_long %>%
mutate(drawn_category = category) %>%
left_join(animacy_csv) %>%
ungroup() %>%
group_by(image_name) %>%
filter(correct_or_not==0) %>%
filter(prob == max(prob)) %>%
group_by(drawn_category) %>%
mutate(count_category = n()) %>%
ungroup() %>%
group_by(drawn_category, class) %>%
summarise(confusions = n(), count_category = count_category[1], animacy_size = animacy_size[1]) %>%
mutate(prop_confusions = confusions / count_category) %>%
ungroup() %>%
mutate(drawn_category = fct_reorder(drawn_category, animacy_size)) %>%
mutate(class = factor(class, levels = levels(drawn_category)))
## Joining, by = "category"
## `summarise()` regrouping output by 'drawn_category' (override with `.groups` argument)
ggplot(data = confusions_count_by_class, aes(x=class, y=drawn_category,fill=prop_confusions)) +
geom_tile() +
theme_few(base_size = 12) +
theme(legend.position = 'left', axis.text.x = element_text(angle = 90,
size = 6, vjust=.5, hjust = 1), axis.text.y = element_text(angle = 0,
size = 6, hjust = 1)) +
coord_fixed() +
scale_fill_viridis(option="A", name = 'Proportion classified') +
ylab('Drawn as') +
xlab('Confused with')
# facet_wrap(~age_group, nrow=1)
ggsave('figures/classifier_confusions_count.pdf', width = 4, height = 5, units = 'in')
ggplot(data = confusions_by_class, aes(x=class, y=drawn_category,fill=mean_prob)) +
geom_tile() +
theme_few(base_size = 12) +
theme(legend.position = 'left', axis.text.x = element_text(angle = 90,
size = 6, vjust=.5, hjust = 1), axis.text.y = element_text(angle = 0,
size = 6, hjust = 1)) +
coord_fixed() +
scale_fill_viridis(option="A", limits=c(0,quantile(confusions_by_age$mean_prob,.998)), name = 'Classifier probability') +
ylab('Drawn as') +
xlab('Confused with')
# facet_wrap(~age_group, nrow=1)
ggsave('figures/classifier_confusions.pdf', width = 4, height = 5, units = 'in')
animacy_size_acc_by_age <- classification_data_long %>%
mutate(drawn_category = category) %>%
left_join(animacy_csv %>% select(animacy, size, category), by = c("drawn_category" = "category")) %>%
ungroup() %>%
rename(drawn_category_animacy = animacy, drawn_category_size = size) %>%
left_join(animacy_csv %>% select(animacy, size, category), by = c("class" = "category")) %>%
rename(class_animacy = animacy, class_size = size) %>%
group_by(image_name, drawn_category, age_numeric) %>%
mutate(max_prob = max(prob), top_class = prob==max_prob) %>%
filter(top_class==TRUE) %>%
mutate(animacy_correct = (drawn_category_animacy == class_animacy)) %>%
mutate(size_correct = (drawn_category_size == class_size))
# anim_cor_by_category_by_age <- animacy_size_acc_by_age %>%
# filter(correct_or_not==0) %>%
# group_by(age_numeric,drawn_category, drawn_category_animacy) %>%
# summarize(avg_animacy_correct = mean(animacy_correct), num_drawings = n()) %>%
# ungroup() %>%
# mutate(drawn_category = fct_reorder(drawn_category, drawn_category_animacy))
#
# ggplot(anim_cor_by_category_by_age, aes(age_numeric,avg_animacy_correct, color=drawn_category_animacy, size=num_drawings)) +
# geom_point(alpha=.5) +
# theme_few(base_size = base_size_chosen) +
# labs(x='Age', y='Proportion animacy correct') +
# theme(legend.position = "none") +
# geom_smooth(span=10, alpha=smooth_alpha) +
# ylim(0,1) +
# geom_hline(yintercept = .43, linetype="dashed", color="grey") +
# facet_wrap(~drawn_category, nrow=6)
baseline_animacy <- mean(animacy_csv$animacy)
baseline_objects <- 1 - mean(animacy_csv$animacy)
anim_cor_by_category_by_age <- animacy_size_acc_by_age %>%
filter(correct_or_not==0) %>%
mutate(drawn_category_animacy = as.factor(drawn_category_animacy)) %>%
mutate(baseline_chance = case_when(drawn_category_animacy == 0 ~ baseline_objects,
drawn_category_animacy == 1 ~ baseline_animacy)) %>%
group_by(age_numeric, drawn_category) %>%
dplyr::summarize(avg_animacy_correct = mean(animacy_correct) - baseline_chance) %>%
distinct(age_numeric, avg_animacy_correct, drawn_category)
## `summarise()` regrouping output by 'age_numeric', 'drawn_category' (override with `.groups` argument)
anim_cor_by_age <- anim_cor_by_category_by_age %>%
group_by(age_numeric) %>%
multi_boot_standard(col = 'avg_animacy_correct')
base_rate = .5
diffs=array()
for (c in 1:48){
rands = sample(0:1,100, replace=TRUE)
diff = mean(rands) - base_rate
diffs[c]=diff
}
ggplot(anim_cor_by_age, aes(x=age_numeric,y=mean, color=age_numeric)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), alpha=.8, size=.2) +
geom_point(data=anim_cor_by_category_by_age, aes(x=age_numeric, y=avg_animacy_correct), alpha=.1, size=.5) +
theme_few(base_size = 8) +
labs(x='Age', y='Prop. animacy correct') +
theme(legend.position = "none") +
# geom_smooth(span=10, alpha=smooth_alpha, color='grey') +
scale_color_viridis(option="D", discrete=FALSE) +
geom_hline(yintercept = 0 , linetype="dashed", color="grey")
ggsave('figures/animacy_classification.pdf', width = 1.5, height = 1.5, units = 'in')
objects_only <- animacy_csv %>%
filter(animacy==0)
baseline_big <- mean(objects_only$size)
baseline_small <- 1 - mean(objects_only$size)
size_cor_by_category_by_age <- animacy_size_acc_by_age %>%
filter(correct_or_not==0) %>%
filter(drawn_category_animacy==0) %>% #only objects
mutate(drawn_category_size = as.factor(drawn_category_size)) %>%
mutate(baseline_chance = case_when(drawn_category_size == 0 ~ baseline_small,
drawn_category_size == 1 ~ baseline_big)) %>%
group_by(age_numeric, drawn_category) %>%
dplyr::summarize(avg_size_correct = mean(size_correct) - baseline_chance) %>%
distinct(age_numeric, avg_size_correct, drawn_category)
## `summarise()` regrouping output by 'age_numeric', 'drawn_category' (override with `.groups` argument)
size_cor_by_age <- size_cor_by_category_by_age %>%
group_by(age_numeric) %>%
multi_boot_standard(col = 'avg_size_correct')
## Plot 4: Plot animacy correct by age
ggplot(size_cor_by_age, aes(x=age_numeric,y=mean, color=age_numeric)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), alpha=.8, size=.2) +
geom_point(data=size_cor_by_category_by_age, aes(x=age_numeric, y=avg_size_correct), alpha=.1, size=.5) +
theme_few(base_size = 8) +
labs(x='Age', y='Prop. object size correct') +
theme(legend.position = "none") +
# geom_smooth(span=10, alpha=smooth_alpha, color='grey') +
scale_color_viridis(option="D", discrete=FALSE) +
geom_hline(yintercept = 0 , linetype="dashed", color="grey")
ggsave('figures/size_classification.pdf', width = 1.5, height = 1.5, units = 'in')
accuracy_age_only <- glmer(correct_or_not ~ age_numeric +
(1|session_id) +
(1|category),
data = d)
## Warning in glmer(correct_or_not ~ age_numeric + (1 | session_id) + (1
## | : calling glmer() with family=gaussian (identity link) as a shortcut to
## lmer() is deprecated; please call lmer() directly
summary(accuracy_age_only)
## Linear mixed model fit by REML ['lmerMod']
## Formula: correct_or_not ~ age_numeric + (1 | session_id) + (1 | category)
## Data: d
##
## REML criterion at convergence: 26541
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.9131 -0.7692 -0.3712 1.0259 2.4535
##
## Random effects:
## Groups Name Variance Std.Dev.
## session_id (Intercept) 0.01010 0.1005
## category (Intercept) 0.02744 0.1657
## Residual 0.18161 0.4262
## Number of obs: 22272, groups: session_id, 7032; category, 48
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.175348 0.025241 6.947
## age_numeric 0.029284 0.001393 21.016
##
## Correlation of Fixed Effects:
## (Intr)
## age_numeric -0.295
accuracy_with_drawing_freq <- glmer(correct_or_not ~ scale(age_numeric) +
scale(drawing_frequency) + (1|session_id) +
(age_numeric|category),
data = d, family="binomial")
accuracy_with_drawing_freq = summary(accuracy_with_drawing_freq)
xtable::xtable(summary(accuracy_with_drawing_freq)$coef, digits=3, caption = "Model coefficients of a GLMM predicting the recognziability of each drawing.")
## % latex table generated in R 3.6.1 by xtable 1.8-4 package
## % Tue Mar 30 11:02:08 2021
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrr}
## \hline
## & Estimate & Std. Error & z value & Pr($>$$|$z$|$) \\
## \hline
## (Intercept) & -1.015 & 0.197 & -5.153 & 0.000 \\
## scale(age\_numeric) & 0.319 & 0.032 & 9.827 & 0.000 \\
## scale(drawing\_frequency) & -0.078 & 0.150 & -0.520 & 0.603 \\
## \hline
## \end{tabular}
## \caption{Model coefficients of a GLMM predicting the recognziability of each drawing.}
## \end{table}
We first examined how classification accuracy varied according to the age of the child who produced each drawing as well as the category that was drawn. As expected, we found that classification accuracy based on these visual features increased steadily with the age of the child producing the drawing (STATS, see Figure XX), validating the expectation that older children’s drawings contain visual features that make them more recognizable. Importantly, we observed this developmental trend for many different categories that varied in the degree to which they are commonly drawn by children (see SI Appendix, Figure XX): for example, some object categories in our dataset are frequently drawn by children (e.g., car, tree, person) and others very infrequently drawn (e.g., cactus, whale, scissors). We formally evaluated this by asking parents of children aged 3-10 years to estimate the frequency with their child draws each category (N=50 parents, ) and directly examining how it affected classification performance in a second generalized linear mixed model (), adding this term as a covariate. We did not observe that classification performance was influenced by drawing frequency (STATS) in fact, many infrequently drawn categories (e.g. piano) had relatively high classification rates, and some frequently drawn categories (e.g. dog), had relatively low classification rates and were more likely to be confused with other similar categories (e.g., other animals). Thus, children increasingly include distinctive visual features of object categories in their drawings across childhood, regardless of whether these are objects that they have significant experience drawing or objects that they may have never drawn before.
correct_subset <- d %>%
filter(correct_or_not==1)
log_odds_corr_only <- lmer(log_odds ~ age_numeric +
scale(drawing_frequency) +
(1|session_id) +
(1|category) + (1|run),
data =correct_subset)
## boundary (singular) fit: see ?isSingular
xtable::xtable(summary(log_odds_corr_only)$coef, digits=3, caption = "Model coefficients of a GLMM predicting the 'distinctiveness' (i.e. log-odds probability of selecting the correct label) assigned to correctly classified drawings")
## % latex table generated in R 3.6.1 by xtable 1.8-4 package
## % Tue Mar 30 11:02:10 2021
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrrr}
## \hline
## & Estimate & Std. Error & df & t value & Pr($>$$|$t$|$) \\
## \hline
## (Intercept) & -1.342 & 0.110 & 52.144 & -12.240 & 0.000 \\
## age\_numeric & 0.061 & 0.006 & 3445.384 & 10.468 & 0.000 \\
## scale(drawing\_frequency) & 0.022 & 0.109 & 42.355 & 0.199 & 0.844 \\
## \hline
## \end{tabular}
## \caption{Model coefficients of a GLMM predicting the 'distinctiveness' (i.e. log-odds probability of selecting the correct label) assigned to correctly classified drawings}
## \end{table}
We thus restricted our analysis to drawings that the model was able to correclty classify (33% of the balanced subset of drawings, N=7468
accuracy_all_covariates <- glmer(correct_or_not ~ scale(avg_tracing_rating)*scale(age_numeric) +
scale(draw_duration) +
scale(mean_intensity) +
scale(num_strokes) +
(1|session_id) +
(1|category),
data = d, family="binomial")
accuracy_all_covariates_no_int <- glmer(correct_or_not ~ scale(avg_tracing_rating)+scale(age_numeric) +
scale(draw_duration) +
scale(mean_intensity) +
scale(num_strokes) +
(1|session_id) +
(1|category),
data = d, family="binomial")
accuracy_no_age <- glmer(correct_or_not ~ scale(avg_tracing_rating) + scale(draw_duration) +
scale(mean_intensity) +
scale(num_strokes) +
(1|session_id) +
(1|category),
data = d, family="binomial")
accuracy_no_age_or_tracing <- glmer(correct_or_not ~ scale(draw_duration) +
scale(mean_intensity) +
scale(num_strokes) +
(1|session_id) +
(1|category),
data = d, family="binomial")
accuracy_no_tracing <- glmer(correct_or_not ~ scale(age_numeric) +
scale(draw_duration) +
scale(mean_intensity) +
scale(num_strokes) +
(1|session_id) +
(1|category),
data = d, family="binomial")
###
null = r.squaredGLMM(accuracy_no_age_or_tracing)
## Warning: 'r.squaredGLMM' now calculates a revised statistic. See the help
## page.
## Warning: The null model is correct only if all variables used by the
## original model remain unchanged.
no_age = r.squaredGLMM(accuracy_no_age)
## Warning: The null model is correct only if all variables used by the
## original model remain unchanged.
no_tracing = r.squaredGLMM(accuracy_no_tracing)
## Warning: The null model is correct only if all variables used by the
## original model remain unchanged.
all = r.squaredGLMM(accuracy_all_covariates)
## Warning: The null model is correct only if all variables used by the
## original model remain unchanged.
no_int = r.squaredGLMM(accuracy_all_covariates_no_int) # no_int = no interaction between tracing/age
## Warning: The null model is correct only if all variables used by the
## original model remain unchanged.
all_covariates_accuracy <- glmer(correct_or_not ~ scale(age_numeric)* scale(drawing_frequency)+ scale(avg_tracing_rating) + scale(draw_duration) +
scale(mean_intensity) +
scale(num_strokes) +
(1|session_id) +
(1|category),
data = d, family="binomial")
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl =
## control$checkConv, : Model failed to converge with max|grad| = 0.00549854
## (tol = 0.001, component 1)
all_covariates_log_odds <- lmer(log_odds ~ scale(age_numeric)+ scale(avg_tracing_rating) + scale(draw_duration) +
scale(mean_intensity) +
scale(num_strokes) +
scale(drawing_frequency) +
(1|session_id) +
(1|category),
data = d)
xtable::xtable(summary(all_covariates_accuracy)$coef, digits=3, caption = "Model coefficients of a GLMM predicting whetjery of selecting the correct label) assigned to correctly classified drawings")
## % latex table generated in R 3.6.1 by xtable 1.8-4 package
## % Tue Mar 30 11:05:13 2021
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrr}
## \hline
## & Estimate & Std. Error & z value & Pr($>$$|$z$|$) \\
## \hline
## (Intercept) & -1.015 & 0.199 & -5.098 & 0.000 \\
## scale(age\_numeric) & 0.238 & 0.020 & 11.904 & 0.000 \\
## scale(drawing\_frequency) & -0.132 & 0.199 & -0.662 & 0.508 \\
## scale(avg\_tracing\_rating) & 0.262 & 0.020 & 12.868 & 0.000 \\
## scale(draw\_duration) & 0.068 & 0.021 & 3.219 & 0.001 \\
## scale(mean\_intensity) & -0.071 & 0.021 & -3.412 & 0.001 \\
## scale(num\_strokes) & 0.016 & 0.018 & 0.886 & 0.376 \\
## scale(age\_numeric):scale(drawing\_frequency) & 0.002 & 0.017 & 0.119 & 0.905 \\
## \hline
## \end{tabular}
## \caption{Model coefficients of a GLMM predicting whetjery of selecting the correct label) assigned to correctly classified drawings}
## \end{table}
out_log_odds = round(summary(all_covariates_log_odds)$coef,3)